{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from datetime import date\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#需要学习的：\n",
    "#numpy.ndarray.flatten https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.flatten.html\n",
    "#numpy flatten的用法：https://blog.csdn.net/qq_18433441/article/details/54916991\n",
    "#pandas 时间序列分析：https://www.cnblogs.com/foley/p/5582358.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Label</th>\n",
       "      <th>Top1</th>\n",
       "      <th>Top2</th>\n",
       "      <th>Top3</th>\n",
       "      <th>Top4</th>\n",
       "      <th>Top5</th>\n",
       "      <th>Top6</th>\n",
       "      <th>Top7</th>\n",
       "      <th>Top8</th>\n",
       "      <th>...</th>\n",
       "      <th>Top16</th>\n",
       "      <th>Top17</th>\n",
       "      <th>Top18</th>\n",
       "      <th>Top19</th>\n",
       "      <th>Top20</th>\n",
       "      <th>Top21</th>\n",
       "      <th>Top22</th>\n",
       "      <th>Top23</th>\n",
       "      <th>Top24</th>\n",
       "      <th>Top25</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2008-08-08</td>\n",
       "      <td>0</td>\n",
       "      <td>b\"Georgia 'downs two Russian warplanes' as cou...</td>\n",
       "      <td>b'BREAKING: Musharraf to be impeached.'</td>\n",
       "      <td>b'Russia Today: Columns of troops roll into So...</td>\n",
       "      <td>b'Russian tanks are moving towards the capital...</td>\n",
       "      <td>b\"Afghan children raped with 'impunity,' U.N. ...</td>\n",
       "      <td>b'150 Russian tanks have entered South Ossetia...</td>\n",
       "      <td>b\"Breaking: Georgia invades South Ossetia, Rus...</td>\n",
       "      <td>b\"The 'enemy combatent' trials are nothing but...</td>\n",
       "      <td>...</td>\n",
       "      <td>b'Georgia Invades South Ossetia - if Russia ge...</td>\n",
       "      <td>b'Al-Qaeda Faces Islamist Backlash'</td>\n",
       "      <td>b'Condoleezza Rice: \"The US would not act to p...</td>\n",
       "      <td>b'This is a busy day:  The European Union has ...</td>\n",
       "      <td>b\"Georgia will withdraw 1,000 soldiers from Ir...</td>\n",
       "      <td>b'Why the Pentagon Thinks Attacking Iran is a ...</td>\n",
       "      <td>b'Caucasus in crisis: Georgia invades South Os...</td>\n",
       "      <td>b'Indian shoe manufactory  - And again in a se...</td>\n",
       "      <td>b'Visitors Suffering from Mental Illnesses Ban...</td>\n",
       "      <td>b\"No Help for Mexico's Kidnapping Surge\"</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2008-08-11</td>\n",
       "      <td>1</td>\n",
       "      <td>b'Why wont America and Nato help us? If they w...</td>\n",
       "      <td>b'Bush puts foot down on Georgian conflict'</td>\n",
       "      <td>b\"Jewish Georgian minister: Thanks to Israeli ...</td>\n",
       "      <td>b'Georgian army flees in disarray as Russians ...</td>\n",
       "      <td>b\"Olympic opening ceremony fireworks 'faked'\"</td>\n",
       "      <td>b'What were the Mossad with fraudulent New Zea...</td>\n",
       "      <td>b'Russia angered by Israeli military sale to G...</td>\n",
       "      <td>b'An American citizen living in S.Ossetia blam...</td>\n",
       "      <td>...</td>\n",
       "      <td>b'Israel and the US behind the Georgian aggres...</td>\n",
       "      <td>b'\"Do not believe TV, neither Russian nor Geor...</td>\n",
       "      <td>b'Riots are still going on in Montreal (Canada...</td>\n",
       "      <td>b'China to overtake US as largest manufacturer'</td>\n",
       "      <td>b'War in South Ossetia [PICS]'</td>\n",
       "      <td>b'Israeli Physicians Group Condemns State Tort...</td>\n",
       "      <td>b' Russia has just beaten the United States ov...</td>\n",
       "      <td>b'Perhaps *the* question about the Georgia - R...</td>\n",
       "      <td>b'Russia is so much better at war'</td>\n",
       "      <td>b\"So this is what it's come to: trading sex fo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2008-08-12</td>\n",
       "      <td>0</td>\n",
       "      <td>b'Remember that adorable 9-year-old who sang a...</td>\n",
       "      <td>b\"Russia 'ends Georgia operation'\"</td>\n",
       "      <td>b'\"If we had no sexual harassment we would hav...</td>\n",
       "      <td>b\"Al-Qa'eda is losing support in Iraq because ...</td>\n",
       "      <td>b'Ceasefire in Georgia: Putin Outmaneuvers the...</td>\n",
       "      <td>b'Why Microsoft and Intel tried to kill the XO...</td>\n",
       "      <td>b'Stratfor: The Russo-Georgian War and the Bal...</td>\n",
       "      <td>b\"I'm Trying to Get a Sense of This Whole Geor...</td>\n",
       "      <td>...</td>\n",
       "      <td>b'U.S. troops still in Georgia (did you know t...</td>\n",
       "      <td>b'Why Russias response to Georgia was right'</td>\n",
       "      <td>b'Gorbachev accuses U.S. of making a \"serious ...</td>\n",
       "      <td>b'Russia, Georgia, and NATO: Cold War Two'</td>\n",
       "      <td>b'Remember that adorable 62-year-old who led y...</td>\n",
       "      <td>b'War in Georgia: The Israeli connection'</td>\n",
       "      <td>b'All signs point to the US encouraging Georgi...</td>\n",
       "      <td>b'Christopher King argues that the US and NATO...</td>\n",
       "      <td>b'America: The New Mexico?'</td>\n",
       "      <td>b\"BBC NEWS | Asia-Pacific | Extinction 'by man...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2008-08-13</td>\n",
       "      <td>0</td>\n",
       "      <td>b' U.S. refuses Israel weapons to attack Iran:...</td>\n",
       "      <td>b\"When the president ordered to attack Tskhinv...</td>\n",
       "      <td>b' Israel clears troops who killed Reuters cam...</td>\n",
       "      <td>b'Britain\\'s policy of being tough on drugs is...</td>\n",
       "      <td>b'Body of 14 year old found in trunk; Latest (...</td>\n",
       "      <td>b'China has moved 10 *million* quake survivors...</td>\n",
       "      <td>b\"Bush announces Operation Get All Up In Russi...</td>\n",
       "      <td>b'Russian forces sink Georgian ships '</td>\n",
       "      <td>...</td>\n",
       "      <td>b'Elephants extinct by 2020?'</td>\n",
       "      <td>b'US humanitarian missions soon in Georgia - i...</td>\n",
       "      <td>b\"Georgia's DDOS came from US sources\"</td>\n",
       "      <td>b'Russian convoy heads into Georgia, violating...</td>\n",
       "      <td>b'Israeli defence minister: US against strike ...</td>\n",
       "      <td>b'Gorbachev: We Had No Choice'</td>\n",
       "      <td>b'Witness: Russian forces head towards Tbilisi...</td>\n",
       "      <td>b' Quarter of Russians blame U.S. for conflict...</td>\n",
       "      <td>b'Georgian president  says US military will ta...</td>\n",
       "      <td>b'2006: Nobel laureate Aleksander Solzhenitsyn...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2008-08-14</td>\n",
       "      <td>1</td>\n",
       "      <td>b'All the experts admit that we should legalis...</td>\n",
       "      <td>b'War in South Osetia - 89 pictures made by a ...</td>\n",
       "      <td>b'Swedish wrestler Ara Abrahamian throws away ...</td>\n",
       "      <td>b'Russia exaggerated the death toll in South O...</td>\n",
       "      <td>b'Missile That Killed 9 Inside Pakistan May Ha...</td>\n",
       "      <td>b\"Rushdie Condemns Random House's Refusal to P...</td>\n",
       "      <td>b'Poland and US agree to missle defense deal. ...</td>\n",
       "      <td>b'Will the Russians conquer Tblisi? Bet on it,...</td>\n",
       "      <td>...</td>\n",
       "      <td>b'Bank analyst forecast Georgian crisis 2 days...</td>\n",
       "      <td>b\"Georgia confict could set back Russia's US r...</td>\n",
       "      <td>b'War in the Caucasus is as much the product o...</td>\n",
       "      <td>b'\"Non-media\" photos of South Ossetia/Georgia ...</td>\n",
       "      <td>b'Georgian TV reporter shot by Russian sniper ...</td>\n",
       "      <td>b'Saudi Arabia: Mother moves to block child ma...</td>\n",
       "      <td>b'Taliban wages war on humanitarian aid workers'</td>\n",
       "      <td>b'Russia: World  \"can forget about\" Georgia\\'s...</td>\n",
       "      <td>b'Darfur rebels accuse Sudan of mounting major...</td>\n",
       "      <td>b'Philippines : Peace Advocate say Muslims nee...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 27 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Date  Label                                               Top1  \\\n",
       "0  2008-08-08      0  b\"Georgia 'downs two Russian warplanes' as cou...   \n",
       "1  2008-08-11      1  b'Why wont America and Nato help us? If they w...   \n",
       "2  2008-08-12      0  b'Remember that adorable 9-year-old who sang a...   \n",
       "3  2008-08-13      0  b' U.S. refuses Israel weapons to attack Iran:...   \n",
       "4  2008-08-14      1  b'All the experts admit that we should legalis...   \n",
       "\n",
       "                                                Top2  \\\n",
       "0            b'BREAKING: Musharraf to be impeached.'   \n",
       "1        b'Bush puts foot down on Georgian conflict'   \n",
       "2                 b\"Russia 'ends Georgia operation'\"   \n",
       "3  b\"When the president ordered to attack Tskhinv...   \n",
       "4  b'War in South Osetia - 89 pictures made by a ...   \n",
       "\n",
       "                                                Top3  \\\n",
       "0  b'Russia Today: Columns of troops roll into So...   \n",
       "1  b\"Jewish Georgian minister: Thanks to Israeli ...   \n",
       "2  b'\"If we had no sexual harassment we would hav...   \n",
       "3  b' Israel clears troops who killed Reuters cam...   \n",
       "4  b'Swedish wrestler Ara Abrahamian throws away ...   \n",
       "\n",
       "                                                Top4  \\\n",
       "0  b'Russian tanks are moving towards the capital...   \n",
       "1  b'Georgian army flees in disarray as Russians ...   \n",
       "2  b\"Al-Qa'eda is losing support in Iraq because ...   \n",
       "3  b'Britain\\'s policy of being tough on drugs is...   \n",
       "4  b'Russia exaggerated the death toll in South O...   \n",
       "\n",
       "                                                Top5  \\\n",
       "0  b\"Afghan children raped with 'impunity,' U.N. ...   \n",
       "1      b\"Olympic opening ceremony fireworks 'faked'\"   \n",
       "2  b'Ceasefire in Georgia: Putin Outmaneuvers the...   \n",
       "3  b'Body of 14 year old found in trunk; Latest (...   \n",
       "4  b'Missile That Killed 9 Inside Pakistan May Ha...   \n",
       "\n",
       "                                                Top6  \\\n",
       "0  b'150 Russian tanks have entered South Ossetia...   \n",
       "1  b'What were the Mossad with fraudulent New Zea...   \n",
       "2  b'Why Microsoft and Intel tried to kill the XO...   \n",
       "3  b'China has moved 10 *million* quake survivors...   \n",
       "4  b\"Rushdie Condemns Random House's Refusal to P...   \n",
       "\n",
       "                                                Top7  \\\n",
       "0  b\"Breaking: Georgia invades South Ossetia, Rus...   \n",
       "1  b'Russia angered by Israeli military sale to G...   \n",
       "2  b'Stratfor: The Russo-Georgian War and the Bal...   \n",
       "3  b\"Bush announces Operation Get All Up In Russi...   \n",
       "4  b'Poland and US agree to missle defense deal. ...   \n",
       "\n",
       "                                                Top8  \\\n",
       "0  b\"The 'enemy combatent' trials are nothing but...   \n",
       "1  b'An American citizen living in S.Ossetia blam...   \n",
       "2  b\"I'm Trying to Get a Sense of This Whole Geor...   \n",
       "3             b'Russian forces sink Georgian ships '   \n",
       "4  b'Will the Russians conquer Tblisi? Bet on it,...   \n",
       "\n",
       "                         ...                          \\\n",
       "0                        ...                           \n",
       "1                        ...                           \n",
       "2                        ...                           \n",
       "3                        ...                           \n",
       "4                        ...                           \n",
       "\n",
       "                                               Top16  \\\n",
       "0  b'Georgia Invades South Ossetia - if Russia ge...   \n",
       "1  b'Israel and the US behind the Georgian aggres...   \n",
       "2  b'U.S. troops still in Georgia (did you know t...   \n",
       "3                      b'Elephants extinct by 2020?'   \n",
       "4  b'Bank analyst forecast Georgian crisis 2 days...   \n",
       "\n",
       "                                               Top17  \\\n",
       "0                b'Al-Qaeda Faces Islamist Backlash'   \n",
       "1  b'\"Do not believe TV, neither Russian nor Geor...   \n",
       "2       b'Why Russias response to Georgia was right'   \n",
       "3  b'US humanitarian missions soon in Georgia - i...   \n",
       "4  b\"Georgia confict could set back Russia's US r...   \n",
       "\n",
       "                                               Top18  \\\n",
       "0  b'Condoleezza Rice: \"The US would not act to p...   \n",
       "1  b'Riots are still going on in Montreal (Canada...   \n",
       "2  b'Gorbachev accuses U.S. of making a \"serious ...   \n",
       "3             b\"Georgia's DDOS came from US sources\"   \n",
       "4  b'War in the Caucasus is as much the product o...   \n",
       "\n",
       "                                               Top19  \\\n",
       "0  b'This is a busy day:  The European Union has ...   \n",
       "1    b'China to overtake US as largest manufacturer'   \n",
       "2         b'Russia, Georgia, and NATO: Cold War Two'   \n",
       "3  b'Russian convoy heads into Georgia, violating...   \n",
       "4  b'\"Non-media\" photos of South Ossetia/Georgia ...   \n",
       "\n",
       "                                               Top20  \\\n",
       "0  b\"Georgia will withdraw 1,000 soldiers from Ir...   \n",
       "1                     b'War in South Ossetia [PICS]'   \n",
       "2  b'Remember that adorable 62-year-old who led y...   \n",
       "3  b'Israeli defence minister: US against strike ...   \n",
       "4  b'Georgian TV reporter shot by Russian sniper ...   \n",
       "\n",
       "                                               Top21  \\\n",
       "0  b'Why the Pentagon Thinks Attacking Iran is a ...   \n",
       "1  b'Israeli Physicians Group Condemns State Tort...   \n",
       "2          b'War in Georgia: The Israeli connection'   \n",
       "3                     b'Gorbachev: We Had No Choice'   \n",
       "4  b'Saudi Arabia: Mother moves to block child ma...   \n",
       "\n",
       "                                               Top22  \\\n",
       "0  b'Caucasus in crisis: Georgia invades South Os...   \n",
       "1  b' Russia has just beaten the United States ov...   \n",
       "2  b'All signs point to the US encouraging Georgi...   \n",
       "3  b'Witness: Russian forces head towards Tbilisi...   \n",
       "4   b'Taliban wages war on humanitarian aid workers'   \n",
       "\n",
       "                                               Top23  \\\n",
       "0  b'Indian shoe manufactory  - And again in a se...   \n",
       "1  b'Perhaps *the* question about the Georgia - R...   \n",
       "2  b'Christopher King argues that the US and NATO...   \n",
       "3  b' Quarter of Russians blame U.S. for conflict...   \n",
       "4  b'Russia: World  \"can forget about\" Georgia\\'s...   \n",
       "\n",
       "                                               Top24  \\\n",
       "0  b'Visitors Suffering from Mental Illnesses Ban...   \n",
       "1                 b'Russia is so much better at war'   \n",
       "2                        b'America: The New Mexico?'   \n",
       "3  b'Georgian president  says US military will ta...   \n",
       "4  b'Darfur rebels accuse Sudan of mounting major...   \n",
       "\n",
       "                                               Top25  \n",
       "0           b\"No Help for Mexico's Kidnapping Surge\"  \n",
       "1  b\"So this is what it's come to: trading sex fo...  \n",
       "2  b\"BBC NEWS | Asia-Pacific | Extinction 'by man...  \n",
       "3  b'2006: Nobel laureate Aleksander Solzhenitsyn...  \n",
       "4  b'Philippines : Peace Advocate say Muslims nee...  \n",
       "\n",
       "[5 rows x 27 columns]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#1) 设置工作路径,划分数据集\n",
    "mypath = 'F:\\Downloads\\lecture02'\n",
    "os.chdir(mypath)\n",
    "data = pd.read_csv('Combined_News_DJIA.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "#划分训练集与测试集\n",
    "train = data[data['Date'] < '2015-01-01']\n",
    "test = data[data['Date'] < '2014-12-31']\n",
    "\n",
    "#选取从第二列到最后一列为X_train,\n",
    "X_train = train[train.columns[2:]]\n",
    "#自己定义语料库：将选中的每条新闻 -> 单独的句子,集合在一起，长度为1611*25=40275个句子, 用flatten()把多维数组变为一维，默认按行\n",
    "corpus = X_train.values.flatten().astype(str)\n",
    "\n",
    "#【划分X_train】取出训练集, 一共1611行，每行用[xx,xx,xx，..] 25条XX表示里面的新闻内容，array格式\n",
    "X_train = X_train.values.astype(str)\n",
    "#对每一行，把25条内容合并为一条, X_train现在每行是由25条新闻组成的一个句子\n",
    "X_train = np.array([' '.join(x) for x in X_train])\n",
    "\n",
    "#【划分X_test】\n",
    "X_test = test[test.columns[2:]]\n",
    "X_test = X_test.values.astype(str)\n",
    "X_test = np.array([' '.join(x) for x in X_test])\n",
    "\n",
    "#【划分y_train】\n",
    "y_train = train['Label'].values\n",
    "\n",
    "#【划分y_test】\n",
    "y_test = test['Label'].values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "#把语料corpus（40275句新闻）打印前两个出来看看, 打印前2个X_train出来看看\n",
    "#corpus[:2]\n",
    "#X_train[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#2）分词操作： 把句子分词nltk.tokenize， 现在语料库都是单词，X_train中现在25条新闻组成的句子的分词，\n",
    "from nltk.tokenize import word_tokenize\n",
    "corpus = [word_tokenize(x) for x in corpus]\n",
    "X_train = [word_tokenize(x) for x in X_train]\n",
    "X_test = [word_tokenize(x) for x in X_test]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "#打印出来看下\n",
    "#X_train[:1]\n",
    "#len(X_train[:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#3） 预处理，打包为一个函数\n",
    "#•小写化\n",
    "#•删除停止词\n",
    "#•删除数字与符号\n",
    "#•lemma\n",
    "\n",
    "#去除停用词\n",
    "from nltk.corpus import stopwords\n",
    "stop = stopwords.words('english')\n",
    "#去除数字，正则表达式\n",
    "import re\n",
    "def hasNumbers(inputString):\n",
    "    return bool(re.search(r'\\d', inputString))\n",
    "#特殊符号\n",
    "def isSymbol(inputString):\n",
    "    return bool(re.match(r'[^\\w]', inputString))\n",
    "#lemma提取词干\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "wordnet_lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "#定义check函数, 检查每一行中的每个单词是否ok\n",
    "def check(word):\n",
    "    if word in stop:\n",
    "        return False\n",
    "    elif hasNumbers(word) or isSymbol(word):\n",
    "        return False\n",
    "    else:\n",
    "        return True\n",
    "    \n",
    "#定义预处理函数preprocessing\n",
    "def preprocessing(sen):\n",
    "    res = []\n",
    "    for word in sen: #对每一个整体的array数组\n",
    "        if check(word): #对每一行中的每一个单词，如果check(word)返回True\n",
    "            #最小化单词，同时去除一些多余的干扰标识\n",
    "            word = word.lower().replace(\"b'\", '').replace('b\"', '').replace('\"', '').replace(\"'\", '')\n",
    "            #进一步提取词干，利用append加入新的数组\n",
    "            res.append(wordnet_lemmatizer.lemmatize(word))\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 处理已有的语料，X_train, X_test\n",
    "corpus = [preprocessing(x) for x in corpus]\n",
    "X_train = [preprocessing(x) for x in X_train]\n",
    "X_test = [preprocessing(x) for x in X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['b', 'georgia', 'two', 'russian', 'warplane', 'country', 'move', 'brink', 'war', 'breaking', 'musharraf', 'impeached', 'russia', 'today', 'column', 'troop', 'roll', 'south', 'ossetia', 'footage', 'fighting', 'youtube', 'russian', 'tank', 'moving', 'towards', 'capital', 'south', 'ossetia', 'reportedly', 'completely', 'destroyed', 'georgian', 'artillery', 'fire', 'b', 'afghan', 'child', 'raped', 'u.n.', 'official', 'say', 'sick', 'three', 'year', 'old', 'raped', 'nothing', 'russian', 'tank', 'entered', 'south', 'ossetia', 'whilst', 'georgia', 'shoot', 'two', 'russian', 'jet', 'b', 'breaking', 'georgia', 'invades', 'south', 'ossetia', 'russia', 'warned', 'would', 'intervene', 'so', 'side', 'b', 'the', 'combatent', 'trial', 'nothing', 'sham', 'salim', 'haman', 'sentenced', 'year', 'kept', 'longer', 'anyway', 'feel', 'like', 'georgian', 'troop', 'retreat', 's.', 'osettain', 'capital', 'presumably', 'leaving', 'several', 'hundred', 'people', 'killed', 'video', 'did', 'u.s.', 'prep', 'georgia', 'war', 'russia', 'rice', 'give', 'green', 'light', 'israel', 'attack', 'iran', 'say', 'u.s.', 'veto', 'israeli', 'military', 'ops', 'announcing', 'class', 'action', 'lawsuit', 'behalf', 'american', 'public', 'against', 'fbi', 'b', 'so', 'georgia', 'war', 'nyt', 'top', 'story', 'opening', 'ceremony', 'olympics', 'what', 'fucking', 'disgrace', 'yet', 'proof', 'decline', 'journalism', 'b', 'china', 'tell', 'bush', 'stay', 'country', 'affair', 'did', 'world', 'war', 'iii', 'start', 'today', 'georgia', 'invades', 'south', 'ossetia', 'russia', 'get', 'involved', 'nato', 'absorb', 'georgia', 'unleash', 'full', 'scale', 'war', 'al-qaeda', 'face', 'islamist', 'backlash', 'condoleezza', 'rice', 'the', 'u', 'would', 'act', 'prevent', 'israeli', 'strike', 'iran', 'israeli', 'defense', 'minister', 'ehud', 'barak', 'israel', 'prepared', 'uncompromising', 'victory', 'case', 'military', 'hostility', 'this', 'busy', 'day', 'the', 'european', 'union', 'approved', 'new', 'sanction', 'iran', 'protest', 'nuclear', 'programme', 'b', 'georgia', 'withdraw', 'soldier', 'iraq', 'help', 'fight', 'russian', 'force', 'georgia', 'breakaway', 'region', 'south', 'ossetia', 'why', 'pentagon', 'think', 'attacking', 'iran', 'bad', 'idea', 'u', 'news', 'amp', 'world', 'report', 'caucasus', 'crisis', 'georgia', 'invades', 'south', 'ossetia', 'indian', 'shoe', 'manufactory', 'and', 'series', 'like', 'work', 'visitor', 'suffering', 'mental', 'illness', 'banned', 'olympics', 'b', 'no', 'help', 'mexico', 'kidnapping', 'surge']\n",
      "['b', 'georgia', 'two', 'russian', 'warplane', 'country', 'move', 'brink', 'war']\n"
     ]
    }
   ],
   "source": [
    "#打印出来看看，处理好的数据，格式就是list of list, \n",
    "print(X_train[0]) #注意这里的不同，而X_train中是把25条新闻组合在一起后再进行分词（基于每天25条新闻分割）\n",
    "print(corpus[0])  #corpus是以每条新闻计数的，然后在打散(基于每条新闻分割)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "#4)训练NLP模型\n",
    "#目前我们获得了干净的数据集合,有了人造的语料库，\n",
    "#先采用Word2Vec，【此处需要了解下参数内容】，相当于转化为词向量\n",
    "\n",
    "from gensim.models.word2vec import Word2Vec\n",
    "model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.11783312,  0.02691937,  0.09941112, -0.45368245,  0.33183682,\n",
       "        0.32597959, -0.07337862,  0.02844724,  0.03279847,  0.42592683,\n",
       "       -0.06710878, -0.29577434, -0.15217425,  0.31164819,  0.13869938,\n",
       "       -0.25164083,  0.15452217, -0.06079166,  0.25306261, -0.12885907,\n",
       "        0.01831123,  0.13781086, -0.05696175, -0.48023048, -0.04177338,\n",
       "        0.14853723,  0.06455724,  0.1141708 ,  0.15943082,  0.01353963,\n",
       "        0.15002282,  0.27767774, -0.45112935,  0.53042918, -0.17026116,\n",
       "       -0.29705769, -0.2334096 , -0.06159178,  0.25846547, -0.39816603,\n",
       "        0.18050121, -0.38905147,  0.14459966, -0.16737907,  0.23966882,\n",
       "       -0.19768161, -0.18540657,  0.19811314,  0.18453735,  0.08856159,\n",
       "       -0.05768959, -0.05436996,  0.09857335,  0.10677049,  0.14429437,\n",
       "       -0.00893751,  0.21400209, -0.11511846,  0.27235582, -0.07288903,\n",
       "       -0.50735539,  0.04730163,  0.04856673, -0.10819522,  0.02564356,\n",
       "        0.11918142, -0.18032908,  0.14434716,  0.51784837, -0.26632002,\n",
       "       -0.08457466,  0.30157652, -0.36694938,  0.42455706, -0.12455811,\n",
       "        0.19020338,  0.11227325,  0.25110865, -0.03063117, -0.24869199,\n",
       "       -0.03613068,  0.0837339 ,  0.1698038 , -0.02521797,  0.02681969,\n",
       "        0.37542969,  0.24930295, -0.14268014,  0.3297506 , -0.095269  ,\n",
       "       -0.1895607 , -0.36985397,  0.21006575,  0.08162557, -0.30286109,\n",
       "        0.18520255, -0.24382019, -0.15660878,  0.16090035, -0.12838386,\n",
       "       -0.01196278, -0.19863351, -0.12404531,  0.11658302, -0.17156328,\n",
       "        0.26594517, -0.02128383, -0.18195786,  0.11676721,  0.13624339,\n",
       "       -0.12507033, -0.12692674, -0.09310919,  0.02519899, -0.09987924,\n",
       "        0.0505043 ,  0.2747488 ,  0.43203905, -0.27049553, -0.16357426,\n",
       "        0.18846688,  0.13633212, -0.20426881,  0.5343926 ,  0.45412546,\n",
       "       -0.49960339, -0.11301432,  0.17539966], dtype=float32)"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#打印出来试试, 注意格式问题\n",
    "model.wv['victory']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "#5)训练NLP模型完成后，用NLP模型表达我们的X\n",
    "#我们的vec是基于每个单词的，怎么办呢？由于我们文本本身的量很小，我们可以把所有的单词的vector拿过来取个平均值：\n",
    "#定义一个函数，输入一个任意word_list,得到他们的平均vector值的方法\n",
    "\n",
    "#先获取所有的vocabulary,注意此处Genism移除了之前的model.vocab,需要替换为model.wv.voccab\n",
    "vocab = model.wv.vocab\n",
    "#得到任意text的vector\n",
    "def get_vector(word_list):\n",
    "    #新建全为0的array\n",
    "    res = np.zeros([128])\n",
    "    count=0\n",
    "    for word in word_list:\n",
    "        res += model[word]\n",
    "        count += 1\n",
    "    return res/count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\ipykernel_launcher.py:13: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([  1.35594710e-01,   4.78771497e-03,   2.08560316e-02,\n",
       "        -3.06096678e-01,   3.00854318e-01,   2.74248731e-01,\n",
       "        -9.45481402e-02,   3.24749298e-02,   2.60592854e-02,\n",
       "         2.72211928e-01,  -5.20233473e-02,  -2.49901214e-01,\n",
       "        -1.35818139e-01,   2.30652633e-01,   1.17855432e-01,\n",
       "        -1.58467300e-01,   1.31108695e-01,  -5.22524355e-02,\n",
       "         1.56251449e-01,  -1.56735947e-01,   4.10375485e-02,\n",
       "         7.54008744e-02,  -5.78824667e-02,  -4.36764856e-01,\n",
       "        -5.68012707e-05,   1.66545488e-01,   9.81779173e-02,\n",
       "         1.13883892e-01,   1.24993741e-01,  -2.06849872e-02,\n",
       "         5.01941387e-02,   1.95990811e-01,  -3.10149046e-01,\n",
       "         4.19034262e-01,  -1.97779381e-01,  -2.29321394e-01,\n",
       "        -1.60880928e-01,  -5.14058918e-03,   2.68698773e-01,\n",
       "        -4.23572254e-01,   1.37644243e-01,  -4.04576414e-01,\n",
       "         1.73564543e-01,  -7.31670639e-02,   1.70787470e-01,\n",
       "        -1.12505922e-01,  -5.55463632e-02,   1.32595204e-01,\n",
       "         2.16302520e-01,   4.20391356e-02,  -1.23759378e-01,\n",
       "        -8.09864616e-02,  -2.05495325e-03,   6.97735680e-02,\n",
       "         1.47720334e-01,  -7.63727450e-02,   2.39388269e-01,\n",
       "        -7.67687598e-02,   2.00291507e-01,  -8.81477974e-02,\n",
       "        -3.93753910e-01,   1.00739359e-01,   5.26666567e-02,\n",
       "        -1.40494367e-01,   1.17330039e-03,  -2.24552644e-02,\n",
       "        -1.65599346e-01,   1.20976103e-01,   4.09649048e-01,\n",
       "        -2.19880165e-01,  -7.02509854e-02,   1.95477608e-01,\n",
       "        -3.62506120e-01,   3.79328108e-01,  -1.07548244e-01,\n",
       "         1.71272406e-01,   1.46250422e-01,   1.41453651e-01,\n",
       "         1.45588263e-02,  -1.41247246e-01,  -8.75408009e-02,\n",
       "         5.01966634e-02,   1.22938755e-01,   1.86772784e-02,\n",
       "         8.80867559e-03,   2.70356638e-01,   1.37273008e-01,\n",
       "        -1.10670625e-01,   2.08382802e-01,  -1.26689192e-02,\n",
       "        -1.86804225e-01,  -3.24974196e-01,   7.92396674e-02,\n",
       "        -1.56744923e-02,  -2.80156194e-01,   1.99330212e-01,\n",
       "        -1.86731101e-01,  -9.07457354e-02,   6.20238125e-02,\n",
       "        -1.68718236e-01,  -1.40722798e-02,  -1.92243535e-01,\n",
       "        -1.21211178e-01,   9.78650190e-02,  -1.16668285e-01,\n",
       "         2.48260579e-01,  -5.26895264e-02,  -1.53200625e-01,\n",
       "         4.24648374e-03,   9.68384523e-02,  -1.63202813e-01,\n",
       "        -1.24868181e-01,  -1.73976895e-02,   4.55888465e-02,\n",
       "        -2.10220700e-01,  -1.12207308e-02,   1.97250958e-01,\n",
       "         3.94483970e-01,  -1.80249821e-01,  -1.06334932e-01,\n",
       "         1.23598279e-01,   1.32076906e-01,  -2.86310395e-01,\n",
       "         3.86937206e-01,   3.83432448e-01,  -3.85571079e-01,\n",
       "        -8.39275474e-02,   1.16886693e-01])"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#这样，我们可以同步把我们的X都给转化成128维的一个vector list\n",
    "get_vector(['hello', 'from', 'the', 'other', 'side'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\ipykernel_launcher.py:13: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "\"word 'impeached' not in vocabulary\"",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-98-9cd1697cc9dd>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mwordlist_test\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mX_train\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mX_train\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[0mX_test\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-98-9cd1697cc9dd>\u001b[0m in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mwordlist_test\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mX_train\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mX_train\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[0mX_test\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-96-aea8a32b1593>\u001b[0m in \u001b[0;36mget_vector\u001b[1;34m(word_list)\u001b[0m\n\u001b[0;32m     11\u001b[0m     \u001b[0mcount\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mword_list\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m         \u001b[0mres\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     14\u001b[0m         \u001b[0mcount\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     15\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[1;33m/\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\gensim\\utils.py\u001b[0m in \u001b[0;36mnew_func1\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m   1396\u001b[0m                     \u001b[0mstacklevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1397\u001b[0m                 )\n\u001b[1;32m-> 1398\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1399\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1400\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mnew_func1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\gensim\\models\\word2vec.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, words)\u001b[0m\n\u001b[0;32m    819\u001b[0m         \u001b[0mRefer\u001b[0m \u001b[0mto\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdocumentation\u001b[0m \u001b[1;32mfor\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mgensim\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkeyedvectors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mWord2VecKeyedVectors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    820\u001b[0m         \"\"\"\n\u001b[1;32m--> 821\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    822\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    823\u001b[0m     \u001b[1;33m@\u001b[0m\u001b[0mdeprecated\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Method will be removed in 4.0.0, use self.wv.__contains__() instead\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\gensim\\models\\keyedvectors.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, entities)\u001b[0m\n\u001b[0;32m    167\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mentities\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    168\u001b[0m             \u001b[1;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 169\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mentities\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    170\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    171\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mentity\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mentity\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mentities\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\gensim\\models\\keyedvectors.py\u001b[0m in \u001b[0;36mget_vector\u001b[1;34m(self, word)\u001b[0m\n\u001b[0;32m    275\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    276\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 277\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    278\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    279\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mwords_closer_than\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mw1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mw2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\gensim\\models\\keyedvectors.py\u001b[0m in \u001b[0;36mword_vec\u001b[1;34m(self, word, use_norm)\u001b[0m\n\u001b[0;32m    272\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    273\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 274\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"word '%s' not in vocabulary\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    275\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    276\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mget_vector\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mword\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: \"word 'impeached' not in vocabulary\""
     ]
    }
   ],
   "source": [
    "wordlist_train = X_train\n",
    "wordlist_test = X_test\n",
    "\n",
    "#KeyError: \"word 'impeached' not in vocabulary\" 出现错误，这里转换失败\n",
    "X_train = [get_vector(x) for x in X_train] \n",
    "X_test = [get_vector(x) for x in X_test]\n",
    "\n",
    "print(X_train[10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "setting an array element with a sequence.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-101-986115155a01>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     10\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mparam\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     11\u001b[0m     \u001b[0mclf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSVR\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgamma\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparam\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 12\u001b[1;33m     \u001b[0mtest_score\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcross_val_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mclf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'roc_auc'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     13\u001b[0m     \u001b[0mtest_scores\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtest_score\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     14\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36mcross_val_score\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)\u001b[0m\n\u001b[0;32m    340\u001b[0m                                 \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mverbose\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    341\u001b[0m                                 \u001b[0mfit_params\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 342\u001b[1;33m                                 pre_dispatch=pre_dispatch)\n\u001b[0m\u001b[0;32m    343\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0mcv_results\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'test_score'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    344\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36mcross_validate\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)\u001b[0m\n\u001b[0;32m    204\u001b[0m             \u001b[0mfit_params\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mreturn_train_score\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    205\u001b[0m             return_times=True)\n\u001b[1;32m--> 206\u001b[1;33m         for train, test in cv.split(X, y, groups))\n\u001b[0m\u001b[0;32m    207\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    208\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m    777\u001b[0m             \u001b[1;31m# was dispatched. In particular this covers the edge\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    778\u001b[0m             \u001b[1;31m# case of Parallel used with an exhausted iterator.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 779\u001b[1;33m             \u001b[1;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    780\u001b[0m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    781\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[1;34m(self, iterator)\u001b[0m\n\u001b[0;32m    623\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    624\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 625\u001b[1;33m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    626\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    627\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m    586\u001b[0m         \u001b[0mdispatch_timestamp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    587\u001b[0m         \u001b[0mcb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBatchCompletionCallBack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdispatch_timestamp\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 588\u001b[1;33m         \u001b[0mjob\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    589\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    590\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\externals\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[1;34m(self, func, callback)\u001b[0m\n\u001b[0;32m    109\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    110\u001b[0m         \u001b[1;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 111\u001b[1;33m         \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    112\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    113\u001b[0m             \u001b[0mcallback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\externals\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m    330\u001b[0m         \u001b[1;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    331\u001b[0m         \u001b[1;31m# arguments in memory\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 332\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    333\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    334\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    129\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    130\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    132\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    133\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m    129\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    130\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 131\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    132\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    133\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)\u001b[0m\n\u001b[0;32m    456\u001b[0m             \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    457\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 458\u001b[1;33m             \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    459\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    460\u001b[0m     \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\svm\\base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m    147\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sparse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msparse\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mcallable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    148\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 149\u001b[1;33m         \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'C'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'csr'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    150\u001b[0m         \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_targets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    151\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)\u001b[0m\n\u001b[0;32m    571\u001b[0m     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,\n\u001b[0;32m    572\u001b[0m                     \u001b[0mensure_2d\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mallow_nd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 573\u001b[1;33m                     ensure_min_features, warn_on_dtype, estimator)\n\u001b[0m\u001b[0;32m    574\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    575\u001b[0m         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,\n",
      "\u001b[1;32mD:\\Software\\annacoda 5.01\\installation\\lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[0;32m    431\u001b[0m                                       force_all_finite)\n\u001b[0;32m    432\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 433\u001b[1;33m         \u001b[0marray\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    434\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    435\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence."
     ]
    }
   ],
   "source": [
    "#6) 建立ML模型\n",
    "#这里，因为我们128维的每一个值都是连续关系的。不是分裂开考虑的。所以，道理上讲，我们是不太适合用RandomForest这类把每个column当做单独的variable来看的方法。（当然，事实是，你也可以这么用）\n",
    "#我们来看看比较适合连续函数的方法：SVM\n",
    "\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "params = [0.1,0.5,1,3,5,7,10,12,16,20,25,30,35,40]\n",
    "test_scores = []\n",
    "for param in params:\n",
    "    clf = SVR(gamma = param)\n",
    "    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')\n",
    "    test_scores.append(np.mean(test_score))\n",
    "\n",
    "#画图结果\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "plt.plot(params, test_scores)\n",
    "plt.title(\"Param vs CV AUC Score\");\n",
    "\n",
    "#后期使用CNN方法\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
